In [1]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PyPDF2 import PdfFileReader, PdfFileWriter
import os
from sklearn import preprocessing
import subprocess
import matplotlib.pyplot as plt
from itertools import islice
import re
In [2]:
%matplotlib inline
In [3]:
pdf_file_path = 'parsers/pdfs/West Bengal/2017-18/2017_bp11_Demand Nos.1-5.pdf'
In [4]:
pdf = PdfFileReader(open(pdf_file_path, 'rb'))
In [5]:
def get_page_image_from_pdf(pdf, page_num, image_file_name):
    page_layout = pdf.getPage(page_num)['/MediaBox']
    command = "convert -density 300 '%s'[%s] '%s'" % (pdf_file_path,
                                                      page_num,
                                                      image_file_name)
    subprocess.check_output(command, shell=True)
    return cv2.imread(image_file_name, 0)
In [6]:
img_page2 = get_page_image_from_pdf(pdf, 2, 'west_bengal_demand_1_5_page_1.png')
img_page4 = get_page_image_from_pdf(pdf, 4, 'west_bengal_demand_1_5_page_1.png')
In [7]:
plt.figure(figsize=(30,20))
plt.imshow(img_page2, cmap='gray')
Out[7]:
In [8]:
plt.figure(figsize=(30,20))
plt.imshow(img_page4, cmap='gray')
Out[8]:
http://crblpocr.blogspot.in/2007/06/run-length-smoothing-algorithm-rlsa.html
The Run Length Smoothing Algorithm (RLSA) is a method that can be used for Block segmentation and text discrimination. The method developed for the Document Analysis System consists of two steps. First, a segmentation procedure subdivides the area of a document into regions (blocks), each of which should contain only one type of data (text, graphic, halftone image, etc.). Next, some basic features of these blocks are calculated.
The basic RLSA is applied to a binary sequence in which white pixels are represented by 0’s and black pixels by 1’s. The algorithm transforms a binary sequence x into an output sequence y according to the following rules:
1. 0’s in x are changed to 1’s if the number of adjacent 0’s is less than or equal to a predefined limit C.
2. 1’s in x are unchanged.
For example, with C = 4 the sequence x is mapped into y as follows:
x : 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0
y : 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 1
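A minimal one-dimensional sketch of that rule, for intuition only (the rlsa() function defined further down works on the 2-D page image instead): every maximal run of 0's no longer than C is filled with 1's.

def rlsa_1d(x, C):
    # Fill every run of 0's whose length is <= C with 1's; 1's stay unchanged.
    y = list(x)
    i = 0
    while i < len(x):
        if x[i] == 0:
            j = i
            while j < len(x) and x[j] == 0:
                j += 1
            if (j - i) <= C:
                y[i:j] = [1] * (j - i)
            i = j
        else:
            i += 1
    return y

x = [0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0]
assert rlsa_1d(x, 4) == [1,1,1,1,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,1,1,1,1,1]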
In [9]:
ret,thresh1 = cv2.threshold(img_page2,0,1,cv2.THRESH_BINARY_INV)
plt.figure(figsize=(30,20))
plt.imshow(thresh1, cmap='gray')
Out[9]:
In [10]:
def rlsa(img, threshold):
    ret, thresh1 = cv2.threshold(img, 0, 1, cv2.THRESH_BINARY_INV)
    img_iter = np.nditer(thresh1, flags=['multi_index'])
    C_vertical, C_horizontal = threshold
    temp_thresh = thresh1.copy()
    while not img_iter.finished:
        x, y = img_iter.multi_index
        x_threshold = x + C_horizontal
        y_threshold = y + C_vertical
        neg_x_threshold = x - C_horizontal
        neg_y_threshold = y - C_vertical
        if (thresh1[x:x_threshold, y:y_threshold].any()
                or thresh1[x:x_threshold, y:neg_y_threshold].any()
                or thresh1[x:neg_x_threshold, y:y_threshold].any()
                or thresh1[x:neg_x_threshold, y:neg_y_threshold].any()):
            temp_thresh[x, y] = 1
        else:
            temp_thresh[x, y] = 0
        img_iter.iternext()
    return temp_thresh

def plot_page(img):
    plt.figure(figsize=(30, 20))
    plt.imshow(img, cmap='gray')
In [11]:
img_page4_rlsa = rlsa(img_page4, (15, 25))
In [12]:
plot_page(img_page4_rlsa)
In [13]:
n_comp, labels, stats, centroids = cv2.connectedComponentsWithStats(img_page4_rlsa)
In [14]:
n_comp
Out[14]:
In [15]:
plot_page(labels)
In [16]:
stats_columns = ["left", "top", "width", "height", "area"]
label_stats = pd.DataFrame(stats, columns=stats_columns)
label_stats['centroid_x'], label_stats['centroid_y'] = centroids[:, 0], centroids[:, 1]
# Ignore the label 0 since it is the background
label_stats.drop(0, inplace=True)
In [17]:
label_stats.head()
Out[17]:
In [18]:
plt.scatter(label_stats.width, label_stats.height)
Out[18]:
In [19]:
label_stats.top.sort_index(ascending=False)
Out[19]:
In [20]:
min_max_scaler = preprocessing.MinMaxScaler()
plt.figure(figsize=(10,20))
plt.scatter(label_stats.top, label_stats.left, s=min_max_scaler.fit_transform(label_stats.area) * 200)
Out[20]:
In [21]:
label_stats[label_stats.top < 500]
Out[21]:
In [22]:
label_stats.iloc[36:46]
Out[22]:
In [23]:
centroids[36:46]
Out[23]:
In [24]:
waste_copy = img_page4.copy()
for centroid in centroids[36:46]:
    x, y = centroid
    cv2.circle(waste_copy, (int(x), int(y)), 20, (0, 0, 255), -1)
plot_page(waste_copy)
In [25]:
label_stats['top_str'] = label_stats.top.apply(lambda x: str(x))
In [26]:
count_top = label_stats.groupby('top_str')['left'].count()
count_top[count_top == 5]
Out[26]:
In [27]:
grouped_top = label_stats.groupby('top_str').apply(lambda x: x.iloc[0:10]).sort_values('top')
grouped_top[grouped_top.top_str == '1258']
Out[27]:
In [28]:
(label_stats.left - (label_stats.width // 2) ).iloc[12:16].tolist()
Out[28]:
In [29]:
(label_stats.left + label_stats.width + 10 ).iloc[12:16].tolist()
Out[29]:
In [30]:
(label_stats.left + label_stats.width + 10)[61:66].tolist()
Out[30]:
In [31]:
# draw lines at 1411, 1700, 1989, 2278
# left - width [1102, 1390, 1679, 1968]
# left - width / 2 [1189, 1477, 1766, 2055]
def plot_lines(line_xs, img, y_start, y_end):
    waste_copy2 = img.copy()
    for line_x in line_xs:
        line_starting_points = (line_x, y_start)
        line_stopping_points = (line_x, y_end)
        cv2.line(waste_copy2, line_starting_points, line_stopping_points, (125, 255, 0), 3)
    plot_page(waste_copy2)

plot_lines([1461, 1750, 2039, 2328], img_page4, 333, 3000)
In [32]:
plot_lines([1000,1200,1461, 1750, 2039, 2328], img_page4, 333, 3000)
In [33]:
rlsa_page2 = rlsa(img_page2, (20, 25))
n_comp, labels, stats, centroids = cv2.connectedComponentsWithStats(rlsa_page2)
In [34]:
def get_label_stats(stats, centroids):
    stats_columns = ["left", "top", "width", "height", "area"]
    label_stats = pd.DataFrame(stats, columns=stats_columns)
    label_stats['centroid_x'], label_stats['centroid_y'] = centroids[:, 0], centroids[:, 1]
    # Ignore the label 0 since it is the background
    label_stats.drop(0, inplace=True)
    return label_stats

page2_label_stats = get_label_stats(stats, centroids)
In [35]:
page2_label_stats['top_str'] = page2_label_stats.top.apply(lambda x: str(x))
page2_label_stats.groupby('top_str').apply(lambda x: x.iloc[0:10]).sort_values('top')
Out[35]:
In [36]:
line_xs = (page2_label_stats.left + page2_label_stats.width + 20).iloc[125:130].tolist()
plot_lines(line_xs, img_page2, 333, 3000)
In [37]:
line_xs
Out[37]:
We need a column selection method that works on pages both with and without vertical column lines.
Left - Width
will not work, as it varies across document pages and would need to be adjusted per page.
Left + Width + Buffer
works :D. Since these blocks are numbers, their right edges line up and give stable column boundaries (see the sketch below).
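A small sketch of the Left + Width + Buffer idea (a hypothetical helper, not part of the notebook's pipeline; the column values passed to tabula later were still picked by inspection):

def guess_column_separators(number_blocks, buffer=10):
    # number_blocks: label_stats rows for the number blocks of one value row.
    # Place a separator just to the right of each block's right edge.
    return sorted((number_blocks.left + number_blocks.width + buffer).tolist())

# e.g. guess_column_separators(label_stats.iloc[61:66])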
In [38]:
eroded_image = cv2.erode(img_page2, kernel = np.ones((5,5),np.uint8), iterations=2)
rlsa_eroded_image = rlsa(eroded_image, (15, 25))
plot_page(rlsa_eroded_image)
In [39]:
eroded_image_page4 = cv2.erode(img_page4, kernel = np.ones((5,5),np.uint8), iterations=2)
rlsa_eroded_image_page4 = rlsa(eroded_image_page4, (15, 25))
plot_page(rlsa_eroded_image_page4)
In [40]:
waste_copy = img_page4.copy()
for index, row in label_stats[(label_stats.width > 1250) & (label_stats.top > 0)].iterrows():
    cv2.line(waste_copy, (20, row['top']), (2400, row['top']), (0, 255, 0), 5)
plot_page(waste_copy)
In [41]:
waste_copy = img_page2.copy()
for index, row in page2_label_stats[(page2_label_stats.width > 1200) & (page2_label_stats.top > 0)].iterrows():
    cv2.line(waste_copy, (20, row['top']), (2400, row['top']), (0, 255, 0), 5)
plot_page(waste_copy)
In [42]:
# the current method
def draw_table_bounds(image_gray):
    table_bounds = None
    temp_image, contours, hierarchy = cv2.findContours(image_gray,
                                                       cv2.RETR_LIST,
                                                       cv2.CHAIN_APPROX_SIMPLE)
    best_match_contour_index = None
    max_contour_size = 0
    count = 0
    for contour in contours:
        if cv2.contourArea(contour) > max_contour_size:
            contour_size = cv2.contourArea(contour)
            x, y, w, h = cv2.boundingRect(contour)
            if x > 0 and y > 0 and contour_size > max_contour_size:
                best_match_contour_index = count
                max_contour_size = contour_size
        count += 1
    if best_match_contour_index:
        x, y, w, h = cv2.boundingRect(contours[best_match_contour_index])
        x = x - 10
        w = w + 10
        cv2.rectangle(image_gray, (x, y), (x + w, y + h), (125, 125, 0), 5)
    return image_gray
In [43]:
# plot_page(draw_table_bounds(img_page4))
Detect similarly shaped blocks to merge together.
How do we detect similar blocks?
- Vertical
- Similar left most points
- Similar right most points
- Horizontal
- Similar top most points
- Similar bottom most points
- Shorter than the parent, or of similar width.
In [44]:
page2_label_stats['right'] = page2_label_stats.left + page2_label_stats.width
page2_label_stats['bottom'] = page2_label_stats.top + page2_label_stats.height
label_stats['right'] = label_stats.left + label_stats.width
label_stats['bottom'] = label_stats.top + label_stats.height
In [45]:
page2_label_stats
Out[45]:
In [46]:
label_2_label_map = np.zeros((page2_label_stats.shape[0], page2_label_stats.shape[0]))
In [47]:
label_2_label_map[0]
Out[47]:
In [48]:
for index, value in enumerate(label_2_label_map[10]):
    base_label = page2_label_stats.iloc[13]
    print(base_label)
    similarly_aligned = page2_label_stats[(page2_label_stats.top > base_label.top) &
                                          (page2_label_stats.right.between(base_label.right - 10, base_label.right + 10)) &
                                          (page2_label_stats.width.between(base_label.width - 20, base_label.width + 20))]
    print(similarly_aligned)
    similarly_aligned = similarly_aligned.append(base_label)
    break
In [49]:
similarly_aligned.index
Out[49]:
In [50]:
page_layout = pdf.getPage(3)['/MediaBox']
if '/Rotate' in pdf.getPage(3) and pdf.getPage(3)['/Rotate'] == 90:
    page_width = float(page_layout[3])
    page_height = float(page_layout[2])
else:
    page_width = float(page_layout[2])
    page_height = float(page_layout[3])
In [51]:
image_height, image_width = img_page2.shape
horizontal_ratio = page_width/image_width
vertical_ratio = page_height/image_height
In [52]:
waste_copy = img_page2.copy()
for index, row in page2_label_stats[page2_label_stats.index.isin(similarly_aligned.index)].iterrows():
    print(row.left * horizontal_ratio, row.top * vertical_ratio)
    print(row.height * vertical_ratio, row.width * horizontal_ratio)
    cv2.rectangle(waste_copy, (row['left'], row['top']), (row['right'], row['bottom']), (125, 125, 0), 5)
plot_page(waste_copy)
In [53]:
def find_similar_blocks(row, label_stats, similarity_map):
    similarly_aligned = label_stats[(label_stats.top > row.top) &
                                    (label_stats.right.between(row.right - 10, row.right + 10)) &
                                    (label_stats.width.between(row.width - 20, row.width + 20))]
    positions = [[row['pos'] - 1] * len(similarly_aligned), similarly_aligned.index.tolist()]
    similarity_map[positions] = 1
    return None
In [54]:
def add_similarity_map(label_stats):
    similarity_map = np.zeros((label_stats.shape[0], label_stats.shape[0]))
    label_stats.apply(find_similar_blocks, args=[label_stats, similarity_map], axis=1)
    similarity_df = pd.DataFrame(similarity_map,
                                 columns=['label_{0}'.format(index) for index in range(label_stats.shape[0])],
                                 index=label_stats.index)
    similarity_df['count'] = similarity_df.apply(sum, axis=1)
    return pd.concat([label_stats, similarity_df], axis=1)
In [55]:
page2_label_stats['pos'] = page2_label_stats.index
similar_labels = add_similarity_map(page2_label_stats)
In [56]:
similar_labels[(similar_labels.left > 0) & (similar_labels['count'] > 1)]
Out[56]:
In [57]:
similar_label_indices = similar_labels[(similar_labels.left > 0) & (similar_labels['count'] > 1)].index.tolist()
parent_label_indices = {}
marked_label_indices = []
In [58]:
# for index in similar_label_indices:
Sudden realization: I might not need what I am currently doing for table boundaries and columns!!
Stupendo...
In [59]:
waste_copy = img_page2.copy()
for index, row in page2_label_stats[(page2_label_stats.width > 1200) & (page2_label_stats.top > 0)].iterrows():
    cv2.line(waste_copy, (row['left'], row['top']), (row['right'] + 50, row['top']), (0, 255, 0), 5)
plot_page(waste_copy)
In [60]:
waste_copy = img_page4.copy()
for index, row in label_stats[(label_stats.width > 1200) & (label_stats.top > 0)].iterrows():
    cv2.line(waste_copy, (row['left'], row['top']), (row['left'] + row['width'] + 50, row['top']), (0, 255, 0), 5)
plot_page(waste_copy)
In the same process, remove lines that fall within a range of 100 points of each other vertically, and join the left-most and right-most points of two independent lines.
In [61]:
possible_boundary_lines = page2_label_stats[(page2_label_stats.width > 1200) & (page2_label_stats.top > 0)]
possible_boundary_lines
Out[61]:
In [62]:
possible_boundary_lines['dist_from_next_line'] = possible_boundary_lines.top.shift(-1) - possible_boundary_lines.top
possible_boundary_lines['dist_from_prev_line'] = possible_boundary_lines.top - possible_boundary_lines.top.shift(1)
possible_boundary_lines
Out[62]:
In [63]:
def get_possible_boundary_lines(label_stats):
    possible_boundary_lines = label_stats[(label_stats.width > 1200) & (label_stats.top > 0)]
    possible_boundary_lines['dist_from_next_line'] = possible_boundary_lines.top.shift(-1) - possible_boundary_lines.top
    possible_boundary_lines['dist_from_prev_line'] = possible_boundary_lines.top - possible_boundary_lines.top.shift(1)
    possible_boundary_lines.replace(pd.np.nan, pd.np.inf, inplace=True)
    return possible_boundary_lines[(possible_boundary_lines.dist_from_next_line > 100) &
                                   (possible_boundary_lines.dist_from_prev_line > 100)]
In [64]:
get_possible_boundary_lines(page2_label_stats)
Out[64]:
In [65]:
get_possible_boundary_lines(label_stats)
Out[65]:
In [66]:
def draw_table_bounding_box(img, label_stats):
    possible_boundary_lines = get_possible_boundary_lines(label_stats).reset_index()
    waste_copy = img.copy()
    tables = []
    if len(possible_boundary_lines) % 2 != 0:
        raise ValueError("Uneven Probable Bounding Lines")
    for index, row in enumerate(possible_boundary_lines.iterrows()):
        if index % 2 != 0:
            table_bounds = {}
            top_bound = possible_boundary_lines.iloc[index - 1]
            bottom_bound = possible_boundary_lines.iloc[index]
            cv2.rectangle(waste_copy, (int(top_bound['left']), int(top_bound['top'])),
                          (int(bottom_bound['right']) + 50, int(bottom_bound['bottom'])), (125, 125, 0), 5)
            table_bounds['left'], table_bounds['top'] = int(top_bound['left']), int(top_bound['top'])
            table_bounds['right'], table_bounds['bottom'] = int(bottom_bound['right']) + 50, int(bottom_bound['bottom'])
            tables.append(table_bounds)
    plot_page(waste_copy)
    return tables
In [67]:
tables = draw_table_bounding_box(img_page2, page2_label_stats)
In [68]:
tables = draw_table_bounding_box(img_page4, label_stats)
Now combine the column output and table boundary to send to tabula and check if it works!!
"java -jar parsers/tabula-0.9.2-jar-with-dependencies.jar --pages %s --area %s,%s,%s,%s --columns %s '%s'" % (page_num + 1, table_bounds["top"], table_bounds["left"], table_bounds["bottom"], table_bounds["right"], column_values, input_pdf_filepath)
In [69]:
tables = draw_table_bounding_box(img_page4, label_stats)
column_values = ','.join(['237','1000','1200', '1461', '1750', '2039'])
for table_bounds in tables:
    command = "java -jar parsers/tabula-0.9.2-jar-with-dependencies.jar --pages %s --area %s,%s,%s,%s --columns %s '%s'" % (
        5, table_bounds["top"], table_bounds["left"], table_bounds["bottom"], table_bounds["right"], column_values, pdf_file_path)
    print(command)
In [70]:
page_table_data = subprocess.check_output(command, shell=True)
In [71]:
page_table_data.split('\n')
Out[71]:
And Tabula isn't giving good results for West Bengal
We are able to get text of blocks using pdftotext from poppler-0.56.0
pdftotext -f 3 -l 3 -x 34 -y 453 -W 110 -H 26 parsers/pdfs/West\ Bengal/2017-18/2017_bp11_Demand\ Nos.1-5.pdf -
x, y, W and H all need to be multiplied by their respective horizontal and vertical ratios (PDF points per rendered image pixel) for this to work.
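A small sketch of that conversion, assuming the horizontal_ratio and vertical_ratio computed in In [51] (this is essentially what get_text_data does further down):

def to_pdf_box(left, top, width, height, horizontal_ratio, vertical_ratio):
    # Image-pixel box (from the 300 DPI render) -> PDF points for pdftotext -x/-y/-W/-H.
    return (int(left * horizontal_ratio), int(top * vertical_ratio),
            int(width * horizontal_ratio), int(height * vertical_ratio))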
Let's explore how we can build rows.
We also need to figure out a better table bounding box technique. The thought is to decide how much content lies between two given lines, maybe with some measure of content density between them (the ratio of black pixels to total pixels).
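A sketch of that measure, assuming a binarized page in the same 0/1 form as thresh1 (ink pixels are 1); this is an idea only, not something used below:

def ink_density_between(binary_img, y_top, y_bottom):
    # Fraction of ink pixels in the horizontal band between two candidate boundary lines;
    # a nearly empty band suggests the lines bound separate tables.
    band = binary_img[y_top:y_bottom, :]
    if band.size == 0:
        return 0.0
    return float(band.sum()) / band.size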
In [72]:
page2_tables = draw_table_bounding_box(img_page2, page2_label_stats)
In [73]:
def get_text(page_num, pdf_file_name, x, y, w, h):
    command = 'pdftotext -enc UTF-8 -f {0} -l {0} -x {1} -y {2} -W {3} -H {4} "{5}" -'.format(page_num + 1,
                                                                                              int(x),
                                                                                              int(y),
                                                                                              int(w),
                                                                                              int(h),
                                                                                              pdf_file_name)
    return subprocess.check_output(command, shell=True)

def get_text_data(row, page_num, pdf_file_path, horizontal_ratio, vertical_ratio):
    x = (row['left'] * horizontal_ratio)
    y = (row['top'] * vertical_ratio)
    width = (row['width'] * horizontal_ratio) + 5
    height = (row['height'] * vertical_ratio) + 5
    text = get_text(page_num, pdf_file_path, x, y, width, height)
    row['text'] = text.strip().replace('-', '')
    row['text_length'] = len(row['text'])
    row['possible_row_merger'] = '\n' in row['text']
    text_matched = re.findall('[a-zA-Z_]+', row['text'])
    comma_sep_matcher = re.compile('^[1-9].(,[0-9]).*$')
    if comma_sep_matcher.match(row['text'].replace('\n', ' ')):
        row['comma_separated_numbers_present'] = True
    else:
        row['comma_separated_numbers_present'] = False
    if len(text_matched) > 0:
        row['is_text'] = True
    else:
        row['is_text'] = False
    try:
        row['number'] = int(row['text'].replace(',', ''))
    except:
        row['number'] = None
    return row

def mark_number_cells(row):
    if row['comma_separated_numbers_present'] or row['is_text'] is False:
        row['label'] = 'cell_values'
    return row

def mark_text_cells(row, labels):
    '''
    Detect cells below, left and right. Based on these cells make guesses of labels.
    '''
    left, right = row['left'], row['right']
    top, bottom = row['top'], row['bottom']
    if row['label'] != 'cell_values':
        if (labels[(labels.right.between(right - 10, right + 10)) &
                   (labels.top > top) & (labels.label == 'cell_values')].shape[0] > 1):
            row['label'] = 'cell_header'
        elif (labels[(labels.top.between(top - 10, top + 10)) &
                     (labels.left > left) & (labels.label == 'cell_values')].shape[0] > 1):
            row['label'] = 'cell_grouping'
        elif (labels[(labels.bottom.between(bottom - 10, bottom + 10)) &
                     (labels.left > left) & (labels.label == 'cell_values')].shape[0] > 1):
            row['label'] = 'cell_grouping'
    return row

def get_possible_horizontal_lines(text_labels):
    possible_line_points = text_labels.bottom.value_counts()
    possible_line_points = possible_line_points[possible_line_points > 1]
    return possible_line_points.index

for table_bounds in page2_tables:
    table = rlsa_page2[table_bounds['top']:table_bounds['bottom'],
                       table_bounds['left']:table_bounds['right']]
    labels_in_table = page2_label_stats[(page2_label_stats.top > (table_bounds['top'] + 5)) &
                                        (page2_label_stats.bottom < (table_bounds['bottom'] - 5)) &
                                        (page2_label_stats.left > (table_bounds['left'] - 5)) &
                                        (page2_label_stats.right < (table_bounds['right'] + 5))]
    labels_in_table = labels_in_table.apply(get_text_data, axis=1, args=[2, pdf_file_path,
                                                                         horizontal_ratio, vertical_ratio])
    text_labels = labels_in_table[labels_in_table.text_length > 0].apply(mark_number_cells, axis=1)
    text_labels = text_labels.apply(mark_text_cells, axis=1, args=[text_labels])
    # possible_horizontal_lines = get_possible_horizontal_lines(text_labels)
    plot_page(rlsa(img_page2[table_bounds['top']:table_bounds['bottom'],
                             table_bounds['left']:table_bounds['right']], (15, 25)))
plot_page(img_page2)
Though we still need to figure out which blocks to split, given the new combined information of text and position, can we also estimate which blocks should be combined or categorized?
- One basic rule of thumb for all documents: if a block is a number and the block next to it is also a number, we can consider it a cell.
- If a block is text and is positioned on the left side, somewhere in the middle of the document or towards the end, it is most probably a cell.
- Text in the top region, directly above cells, can be considered a header.
- The problem of merging and un-merging the generated blocks still needs to be resolved, somehow.
- Another problem to ponder: groupings, or multi-level structure. We have already tagged all
In [74]:
text_labels
Out[74]:
There are flaws in the current approach, as I am just checking for adjacency using fixed point offsets. It should instead look for the adjacent block with respect to direction and alignment alone, and then use the information from that block to decide whether the current block is a cell, a header, or a grouping. That should make it more robust.
In [75]:
text_labels[['text', 'label']][pd.isnull(text_labels.label)]
Out[75]:
Currently we are able to detect number-based cells, with a minor issue of splitting them wherever required; but we have already shown that merged values are detectable from the extracted text, so it's not all bad. Cell headers are also working, the lowest level of grouping is working decently, and the rule intuitively seems general enough.
What about multi-level groupings?
These need to be detected once all the other tags are done. The idea is to look for these groupings to the left of the lowest level of grouping; the assumption is that a higher-level grouping will always lie somewhere to the left of the lower-level groupings. In principle this can go up to the n-th level, though I don't think it would go beyond two levels in practice, because deeper levels are harder to understand.
This idea seems general enough, though I am pretty sure there will be some quirks. But what about grouping labels like "State Legislatures", which don't have any corresponding value themselves but encapsulate other groups? From what I understand, those are more like column names.
A grouping found to the left of a lower-level grouping would be its higher grouping.
In [76]:
def combine_headers(row, labels):
    left, top = row['left'], row['top']
    right, bottom = row['right'], row['bottom']
    # labels in a radius of 15
    x_pos_axis = labels[(labels.left.between(right - 10, right + 15)) &
                        (labels.label == 'cell_header')]
    x_neg_axis = labels[(labels.right.between(left - 15, left + 10)) &
                        (labels.label == 'cell_header')]
    y_pos_axis = labels[(labels.bottom.between(top - 20, top + 10)) &
                        (labels.label == 'cell_header')]
    y_neg_axis = labels[(labels.top.between(bottom - 10, bottom + 15)) &
                        (labels.label == 'cell_header')]
    if (len(x_pos_axis) + len(x_neg_axis) + len(y_pos_axis) + len(y_neg_axis)) > 0:
        if pd.isnull(row['label']):
            row['label'] = 'cell_header'
    return row

def find_higher_level_groupings(row, cell_grouping_height_range):
    bottom, label = row['bottom'], row['label']
    min_range, max_range = cell_grouping_height_range
    if (bottom > min_range) and (bottom < max_range) and (pd.isnull(label)):
        row['label'] = 'cell_higher_grouping'
    return row
In [77]:
text_labels.apply(combine_headers, axis=1, args=[text_labels])
Out[77]:
In [78]:
grouping_height_range = text_labels[text_labels.label == 'cell_grouping'].aggregate({'top': [min, max]}).values.flatten()
text_labels = text_labels.apply(combine_headers, axis=1, args=[text_labels]).apply(find_higher_level_groupings, axis=1, args=[grouping_height_range])
In [79]:
page4_tables = draw_table_bounding_box(img_page4, label_stats)
for table_bounds in page4_tables:
    table = rlsa_eroded_image_page4[table_bounds['top']:table_bounds['bottom'],
                                    table_bounds['left']:table_bounds['right']]
    labels_in_table = label_stats[(label_stats.top > (table_bounds['top'] + 5)) &
                                  (label_stats.bottom < (table_bounds['bottom'] - 5)) &
                                  (label_stats.left > (table_bounds['left'] - 5)) &
                                  (label_stats.right < (table_bounds['right'] + 5))]
    labels_in_table = labels_in_table.apply(get_text_data, axis=1, args=[4, pdf_file_path,
                                                                         horizontal_ratio, vertical_ratio])
    text_labels = labels_in_table[labels_in_table.text_length > 0].apply(mark_number_cells, axis=1)
    text_labels = text_labels.apply(mark_text_cells, axis=1, args=[text_labels])
    grouping_height_range = text_labels[text_labels.label == 'cell_grouping'].aggregate({'top': [min, max]}).values.flatten()
    text_labels = text_labels.apply(combine_headers, axis=1, args=[text_labels]).apply(find_higher_level_groupings, axis=1, args=[grouping_height_range])
In [80]:
text_labels[['text', 'label']].to_csv('page4_tags.csv')
There are some odd results, which suggests the problem is not easily solvable with heuristics alone. Maybe we need to think about it from another angle: rather than trying to say what each block is, why not focus on the important parts and let everything else fall into place? To expand on this, currently we are able to:
- Get headers.
- Get values.
Based on these two, let's draw the outline structure of the table with rows and columns, and then figure out which row each of the remaining text blocks falls into. So instead of figuring out groupings, grouping levels and headers, why not simplify the problem by just trying to put each text block into some row?
Todo :-
In [81]:
def extract_table_attributes(page_num, pdf):
    img_page = get_page_image_from_pdf(pdf, page_num, 'west_bengal_demand_1_5_page_1.png')
    image_height, image_width = img_page.shape
    horizontal_ratio = page_width / image_width
    vertical_ratio = page_height / image_height
    img_rlsa = rlsa(img_page, (20, 25))
    n_comp, labels, stats, centroids = cv2.connectedComponentsWithStats(img_rlsa)
    label_stats = get_label_stats(stats, centroids)
    label_stats['right'] = label_stats.left + label_stats.width
    label_stats['bottom'] = label_stats.top + label_stats.height
    tables = draw_table_bounding_box(img_page, label_stats)
    if len(tables) == 0:
        print("No Tables found on page number {0}".format(page_num))
    all_text_labels = pd.DataFrame()
    index = 1
    for table_bounds in tables:
        table = img_rlsa[table_bounds['top']:table_bounds['bottom'],
                         table_bounds['left']:table_bounds['right']]
        labels_in_table = label_stats[(label_stats.top > (table_bounds['top'] + 5)) &
                                      (label_stats.bottom < (table_bounds['bottom'] - 5)) &
                                      (label_stats.left > (table_bounds['left'] - 5)) &
                                      (label_stats.right < (table_bounds['right'] + 5))]
        labels_in_table = labels_in_table.apply(get_text_data, axis=1, args=[page_num, pdf_file_path,
                                                                             horizontal_ratio, vertical_ratio])
        text_labels = labels_in_table[labels_in_table.text_length > 0].apply(mark_number_cells, axis=1)
        text_labels = text_labels.apply(mark_text_cells, axis=1, args=[text_labels])
        grouping_height_range = text_labels[text_labels.label == 'cell_grouping'].aggregate({'top': [min, max]}).values.flatten()
        text_labels = text_labels.apply(combine_headers, axis=1, args=[text_labels]).apply(find_higher_level_groupings, axis=1, args=[grouping_height_range])
        text_labels['page_num'] = page_num
        text_labels['table_num'] = index
        index += 1
        all_text_labels = pd.concat([all_text_labels, text_labels])
    return all_text_labels
In [82]:
extract_table_attributes(2, pdf)
Out[82]:
In [83]:
def find_rows(labels):
    """
    Figure out the points where rows start and end.
    1. Headers would be the starting point. Bottom of the headers
    2. Each cell value will be separated by a row. We need to handle where text is merged.
    """
    rows = []
    rows.extend(labels[labels['label'] == 'cell_header'].aggregate({'top': min, 'bottom': max}).values.flatten())
    rows.extend(labels[labels['label'] == 'cell_values']['bottom'].unique())
    filtered_rows = []
    for index, row_sep in enumerate(rows):
        if index > 0:
            last_row_sep = rows[index - 1]
            if row_sep > (last_row_sep + 20):
                filtered_rows.append(row_sep)
        else:
            filtered_rows.append(row_sep)
    return filtered_rows

def plot_horizontal_lines(line_ys, img, x_start, x_end):
    waste_copy2 = img.copy()
    for line_y in line_ys:
        line_starting_points = (int(x_start), int(line_y))
        line_stopping_points = (int(x_end), int(line_y))
        cv2.line(waste_copy2, line_starting_points, line_stopping_points, (125, 255, 0), 3)
    plot_page(waste_copy2)

def find_common_headers(labels):
    unmarked_headers = labels[(labels.label == 'cell_header') & (pd.isnull(labels.header_index))]
    if len(unmarked_headers) > 0:
        header = unmarked_headers[unmarked_headers.top == unmarked_headers.top.min()]
        labels.ix[((labels.label == 'cell_header') &
                   (labels.left.between(header.left.iloc[0] - header.width.iloc[0], header.right.iloc[0])) &
                   (pd.isnull(labels.header_index))
                   ), 'header_index'] = header.pos.iloc[0]
        return find_common_headers(labels)
    return labels

def cleanse_labels(labels):
    """
    We need to clean up labels to transform them into csv's
    1. Combine Headers so that they don't end up in new lines.
    2. Split label blocks with 2 values combined
    """
    processed_labels = pd.DataFrame()
    for index, row in labels.iterrows():
        splitted_row = []
        if row.possible_row_merger == True and row.label != 'cell_header':
            for i, value in enumerate(row.text.split('\n')):
                new_row = {}
                for col in row.index:
                    new_row[col] = row[col]
                new_height = row.height // len(row.text.split('\n'))
                new_row['height'] = new_height
                new_row['top'] = row.top + (i * new_height)
                new_row['bottom'] = new_row['top'] + new_height
                new_row['text'] = value
                new_row['possible_row_merger'] = False
                splitted_row.append(new_row)
            processed_labels = processed_labels.append(splitted_row)
        else:
            processed_labels = processed_labels.append(row)
    return find_common_headers(processed_labels)

def mark_titles(labels):
    labels.ix[((labels.is_text == True) &
               (labels.centroid_x.between(1200, 1300)) &
               (pd.isnull(labels.label))
               ), 'label'] = 'title'
    return labels

def mark_summary(row):
    if row['is_text'] == True:
        summaries = re.findall("^[A-z, a-z]*\sRs. [0-9|,]*$", row['text'])
        if len(summaries) > 0:
            row['label'] = 'cell_summary'
    return row

def filter_out_footer(labels):
    bottom_boundary = labels[labels.label == 'cell_values']['top'].max()
    return labels[labels.bottom < bottom_boundary]

def extract_table_attributes_row_based(page_num, pdf):
    img_page = get_page_image_from_pdf(pdf, page_num, 'west_bengal_demand_1_5_page_1.png')
    image_height, image_width = img_page.shape
    horizontal_ratio = page_width / image_width
    vertical_ratio = page_height / image_height
    img_rlsa = rlsa(img_page, (20, 25))
    n_comp, labels, stats, centroids = cv2.connectedComponentsWithStats(img_rlsa)
    label_stats = get_label_stats(stats, centroids)
    label_stats['right'] = label_stats.left + label_stats.width
    label_stats['bottom'] = label_stats.top + label_stats.height
    label_stats = label_stats.apply(get_text_data, axis=1, args=[page_num, pdf_file_path,
                                                                 horizontal_ratio, vertical_ratio])
    # label_stats = mark_titles(label_stats)
    tables = draw_table_bounding_box(img_page, label_stats)
    if len(tables) == 0:
        print("No Tables found on page number {0}".format(page_num))
    all_text_labels = pd.DataFrame()
    index = 1
    for table_bounds in tables:
        table = img_rlsa[table_bounds['top']:table_bounds['bottom'],
                         table_bounds['left']:table_bounds['right']]
        labels_in_table = label_stats[(label_stats.top > (table_bounds['top'] + 5)) &
                                      (label_stats.bottom < (table_bounds['bottom'] - 5)) &
                                      (label_stats.left > (table_bounds['left'] - 5)) &
                                      (label_stats.right < (table_bounds['right'] + 5))]
        # labels_in_table = labels_in_table.apply(get_text_data, axis=1, args=[page_num, pdf_file_path,
        #                                                                      horizontal_ratio, vertical_ratio])
        text_labels = labels_in_table[labels_in_table.text_length > 0].apply(mark_number_cells, axis=1)
        text_labels = text_labels.apply(mark_text_cells, axis=1, args=[text_labels])
        grouping_height_range = text_labels[text_labels.label == 'cell_grouping'].aggregate({'top': [min, max]}).values.flatten()
        text_labels = text_labels.apply(combine_headers, axis=1, args=[text_labels]).apply(find_higher_level_groupings, axis=1, args=[grouping_height_range])
        text_labels['header_index'] = None
        text_labels['pos'] = text_labels.index
        text_labels = cleanse_labels(text_labels)
        text_labels['page_num'] = page_num
        text_labels['table_num'] = index
        rows = find_rows(text_labels)
        row_ranges = zip(rows, rows[1:])
        for start, end in row_ranges:
            print(text_labels[text_labels.top.between(start, end - 5)].text.values)
        plot_horizontal_lines(rows, img_page, 150, 2350)
        index += 1
        all_text_labels = pd.concat([all_text_labels, text_labels])
        break
    return all_text_labels
In [84]:
extract_table_attributes_row_based(2, pdf)
In [ ]:
label_stats
In [ ]:
extract_table_attributes_row_based(22, pdf)
In [ ]:
extract_table_attributes_row_based(30, pdf)
There is a lot of overlap in the rows, and it seems to work well on a couple of pages. Another observation: page no. 22 has a table separation without any vertical lines; there are two observations on that front:
- There is a huge text block between them
- There is a lot of space between the tables due to the heading
Todo
- filtering out overlapping or close by lines -------x
- do the following label processing : -
- separate out labels where `\n` new line character is present into two labels -------x
- merge all header labels into one ---------x
- a better table detection method.
- a process to convert the information into a consumable csv.
- Detect Titles and store the tables with title names
In [ ]:
label_stats
In [85]:
img_page = get_page_image_from_pdf(pdf, 2, 'west_bengal_demand_1_5_page_1.png')
image_height, image_width = img_page.shape
horizontal_ratio = page_width / image_width
vertical_ratio = page_height / image_height
img_rlsa = rlsa(img_page, (20, 25))
n_comp, labels, stats, centroids = cv2.connectedComponentsWithStats(img_rlsa)
label_stats = get_label_stats(stats, centroids)
label_stats['right'] = label_stats.left + label_stats.width
label_stats['bottom'] = label_stats.top + label_stats.height
In [86]:
img_page.shape
Out[86]:
In [87]:
label_stats_with_text = label_stats.apply(get_text_data, axis=1, args=[2, pdf_file_path, horizontal_ratio, vertical_ratio])
In [88]:
label_stats_with_text[label_stats_with_text.is_text == True]
Out[88]:
In [89]:
plot_page(img_page)
In [90]:
label_stats_with_text[(label_stats_with_text.is_text == True) & (label_stats_with_text.centroid_x.between(1200, 1300))]
Out[90]:
In [91]:
label_stats_with_text = label_stats_with_text[label_stats_with_text.text_length > 0].apply(mark_number_cells, axis=1)
label_stats_with_text = label_stats_with_text.apply(mark_text_cells, axis=1, args=[label_stats_with_text])
label_stats_with_text = label_stats_with_text.apply(combine_headers, axis=1, args=[label_stats_with_text])
label_stats_with_text = label_stats_with_text.apply(mark_summary, axis=1)
label_stats_with_text = filter_out_footer(label_stats_with_text)
In [92]:
label_stats_with_text
Out[92]:
In [93]:
label_stats_with_text[['text', 'label']].values
Out[93]:
In [94]:
label_stats_with_text[label_stats_with_text.label == 'cell_values']['top'].max()
Out[94]:
In [95]:
label_stats_with_text[label_stats_with_text.bottom < 3041].shape
Out[95]:
In [96]:
titles = label_stats_with_text[label_stats_with_text.label == 'title']
titles
Out[96]:
In [97]:
titles['next_diff'] = titles.top - titles.top.shift(1)
titles[titles.next_diff > titles.next_diff.mean()]
Out[97]:
In [98]:
label_stats_with_text[label_stats_with_text.top < 1025]
Out[98]:
In [99]:
label_stats_with_text[label_stats_with_text.top >= 1025]
Out[99]:
In [ ]: